This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document.
Install Libraries
library(dplyr)
library(ggplot2)
library(reshape2)
library(RColorBrewer)
library(scales)
library(boot)
Load and Transform Data
nba_data <- read.csv("nba_2022-23_all_stats_with_salary.csv")
full_statistics <- nba_data %>% select(Salary, GP:VORP)
head(full_statistics)
## Salary GP GS MP FG FGA FG. X3P X3PA X3P. X2P X2PA X2P. eFG. FT
## 1 48070014 56 56 34.7 10.0 20.2 0.493 4.9 11.4 0.427 5.1 8.8 0.579 0.614 4.6
## 2 47345760 34 3 22.2 4.1 9.9 0.408 1.0 3.2 0.303 3.1 6.7 0.459 0.457 2.3
## 3 47080179 73 24 29.1 5.9 13.6 0.436 1.2 3.9 0.311 4.7 9.7 0.487 0.481 2.8
## 4 44474988 55 54 35.5 11.1 22.2 0.500 2.2 6.9 0.321 8.9 15.3 0.580 0.549 4.6
## 5 44119845 47 47 35.6 10.3 18.3 0.560 2.0 4.9 0.404 8.3 13.4 0.617 0.614 6.5
## 6 43279250 50 50 33.5 8.9 17.6 0.506 1.6 4.4 0.365 7.3 13.2 0.552 0.551 3.8
## FTA FT. ORB DRB TRB AST STL BLK TOV PF PTS Total.Minutes PER TS. X3PAr
## 1 5.0 0.915 0.7 5.4 6.1 6.3 0.9 0.4 3.2 2.1 29.4 1941 24.1 0.656 0.564
## 2 3.3 0.681 0.4 2.3 2.7 5.2 0.8 0.4 2.4 1.7 11.4 755 13.6 0.498 0.322
## 3 4.3 0.656 1.2 4.6 5.8 7.5 1.0 0.5 3.5 2.2 15.9 2126 16.1 0.513 0.289
## 4 5.9 0.768 1.2 7.1 8.3 6.8 0.9 0.6 3.2 1.6 28.9 1954 23.9 0.583 0.309
## 5 7.1 0.919 0.4 6.3 6.7 5.0 0.7 1.4 3.3 2.1 29.1 1672 25.9 0.677 0.267
## 6 4.6 0.842 0.8 3.1 3.9 5.4 0.9 0.7 2.9 2.1 23.2 1673 19.7 0.593 0.249
## FTr ORB. DRB. TRB. AST. STL. BLK. TOV. USG. OWS DWS WS WS.48 OBPM DBPM
## 1 0.248 2.3 16.8 9.7 30.0 1.3 0.9 12.5 31.0 5.8 2.0 7.8 0.192 7.5 0.1
## 2 0.334 2.1 11.4 6.8 35.3 1.8 1.4 17.1 27.0 -0.4 0.7 0.3 0.020 -0.8 -0.4
## 3 0.317 4.7 16.5 10.8 38.6 1.7 1.3 18.4 27.7 -0.6 2.6 1.9 0.044 0.3 -0.1
## 4 0.268 3.7 20.8 12.5 33.5 1.2 1.4 11.6 33.3 3.2 2.4 5.6 0.138 5.5 0.6
## 5 0.387 1.2 19.5 10.5 24.5 1.0 3.4 13.4 30.7 4.7 2.1 6.8 0.194 6.0 1.2
## 6 0.260 2.8 9.9 6.5 26.6 1.3 1.7 12.9 29.2 2.2 1.2 3.4 0.099 2.9 -1.2
## BPM VORP
## 1 7.5 4.7
## 2 -1.2 0.1
## 3 0.2 1.2
## 4 6.1 4.0
## 5 7.1 3.9
## 6 1.8 1.6
basic_statistics <- nba_data %>% select(Salary, GP:X2P., FT:Total.Minutes)
head(basic_statistics)
## Salary GP GS MP FG FGA FG. X3P X3PA X3P. X2P X2PA X2P. FT FTA
## 1 48070014 56 56 34.7 10.0 20.2 0.493 4.9 11.4 0.427 5.1 8.8 0.579 4.6 5.0
## 2 47345760 34 3 22.2 4.1 9.9 0.408 1.0 3.2 0.303 3.1 6.7 0.459 2.3 3.3
## 3 47080179 73 24 29.1 5.9 13.6 0.436 1.2 3.9 0.311 4.7 9.7 0.487 2.8 4.3
## 4 44474988 55 54 35.5 11.1 22.2 0.500 2.2 6.9 0.321 8.9 15.3 0.580 4.6 5.9
## 5 44119845 47 47 35.6 10.3 18.3 0.560 2.0 4.9 0.404 8.3 13.4 0.617 6.5 7.1
## 6 43279250 50 50 33.5 8.9 17.6 0.506 1.6 4.4 0.365 7.3 13.2 0.552 3.8 4.6
## FT. ORB DRB TRB AST STL BLK TOV PF PTS Total.Minutes
## 1 0.915 0.7 5.4 6.1 6.3 0.9 0.4 3.2 2.1 29.4 1941
## 2 0.681 0.4 2.3 2.7 5.2 0.8 0.4 2.4 1.7 11.4 755
## 3 0.656 1.2 4.6 5.8 7.5 1.0 0.5 3.5 2.2 15.9 2126
## 4 0.768 1.2 7.1 8.3 6.8 0.9 0.6 3.2 1.6 28.9 1954
## 5 0.919 0.4 6.3 6.7 5.0 0.7 1.4 3.3 2.1 29.1 1672
## 6 0.842 0.8 3.1 3.9 5.4 0.9 0.7 2.9 2.1 23.2 1673
#Used this link: https://faq.stathead.com/knowledge/basketball-advanced-stats to determine the advanced statistics
advanced_statistics <- nba_data %>% select(Salary, eFG., PER:VORP)
head(advanced_statistics)
## Salary eFG. PER TS. X3PAr FTr ORB. DRB. TRB. AST. STL. BLK. TOV. USG.
## 1 48070014 0.614 24.1 0.656 0.564 0.248 2.3 16.8 9.7 30.0 1.3 0.9 12.5 31.0
## 2 47345760 0.457 13.6 0.498 0.322 0.334 2.1 11.4 6.8 35.3 1.8 1.4 17.1 27.0
## 3 47080179 0.481 16.1 0.513 0.289 0.317 4.7 16.5 10.8 38.6 1.7 1.3 18.4 27.7
## 4 44474988 0.549 23.9 0.583 0.309 0.268 3.7 20.8 12.5 33.5 1.2 1.4 11.6 33.3
## 5 44119845 0.614 25.9 0.677 0.267 0.387 1.2 19.5 10.5 24.5 1.0 3.4 13.4 30.7
## 6 43279250 0.551 19.7 0.593 0.249 0.260 2.8 9.9 6.5 26.6 1.3 1.7 12.9 29.2
## OWS DWS WS WS.48 OBPM DBPM BPM VORP
## 1 5.8 2.0 7.8 0.192 7.5 0.1 7.5 4.7
## 2 -0.4 0.7 0.3 0.020 -0.8 -0.4 -1.2 0.1
## 3 -0.6 2.6 1.9 0.044 0.3 -0.1 0.2 1.2
## 4 3.2 2.4 5.6 0.138 5.5 0.6 6.1 4.0
## 5 4.7 2.1 6.8 0.194 6.0 1.2 7.1 3.9
## 6 2.2 1.2 3.4 0.099 2.9 -1.2 1.8 1.6
Plot Salaries by Position
nba_data$Position <- factor(nba_data$Position, levels = c("PG", "PG-SG", "SG", "SG-PG", "SF", "SF-SG", "SF-PF", "PF", "C"))
ggplot(nba_data, aes(x = Position, y = Salary)) +
geom_boxplot(outlier.shape = 1) +
scale_y_continuous(labels = label_comma()) +
labs(title = "Salaries by Position", x = "Position", y = "Salary") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 12),
axis.text.x = element_text(angle = 45, hjust = 1)
)
Perform ANOVA Test
#If the p-value is less than the significance level (0.05), conclude that there is a statistically significant difference in salaries among the different positions
anova_model <- aov(Salary ~ Position, data = nba_data)
summary(anova_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## Position 8 1.794e+15 2.243e+14 1.989 0.0462 *
## Residuals 458 5.164e+16 1.127e+14
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Perform Tukey’s Test
#Use the results to identify the positions with statistically significant differences in mean salaries
tukey_results <- TukeyHSD(anova_model)
tukey_results
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Salary ~ Position, data = nba_data)
##
## $Position
## diff lwr upr p adj
## PG-SG-PG 9878955.0 -13824077 33581986.87 0.9314209
## SG-PG -4898272.3 -9771392 -25152.95 0.0476956
## SG-PG-PG 5071233.5 -18631798 28774265.37 0.9991405
## SF-PG -3447320.6 -8571677 1677035.46 0.4768417
## SF-SG-PG 660189.5 -23042842 24363221.37 1.0000000
## SF-PF-PG -8579573.5 -41887888 24728740.78 0.9967653
## PF-PG -2694528.1 -7886710 2497654.14 0.7949870
## C-PG -4296851.4 -9421207 827504.68 0.1838278
## SG-PG-SG -14777227.3 -38380908 8826453.19 0.5783584
## SG-PG-PG-SG -4807721.5 -37901832 28286389.15 0.9999528
## SF-PG-SG -13326275.6 -36983103 10330551.45 0.7113125
## SF-SG-PG-SG -9218765.5 -42312876 23875345.15 0.9944478
## SF-PF-PG-SG -18458528.5 -58990371 22073313.80 0.8900961
## PF-PG-SG -12573483.2 -36245095 11098128.50 0.7731829
## C-PG-SG -14175806.4 -37832633 9481020.67 0.6364911
## SG-PG-SG 9969505.8 -13634175 33573186.33 0.9262137
## SF-SG 1450951.7 -3192220 6094123.03 0.9880391
## SF-SG-SG 5558461.8 -18045219 29162142.33 0.9982826
## SF-PF-SG -3681301.2 -36918988 29556385.46 0.9999942
## PF-SG 2203744.2 -2514176 6921664.44 0.8750890
## C-SG 601420.9 -4041750 5244592.25 0.9999805
## SF-SG-PG -8518554.1 -32175381 15138272.95 0.9707026
## SF-SG-SG-PG -4411044.0 -37505155 28683066.65 0.9999757
## SF-PF-SG-PG -13650807.0 -54182649 26881035.30 0.9806514
## PF-SG-PG -7765761.7 -31437373 15905850.00 0.9836239
## C-SG-PG -9368084.9 -33024912 14288742.17 0.9485794
## SF-SG-SF 4107510.1 -19549317 27764337.17 0.9998172
## SF-PF-SF -5132252.9 -38407702 28143196.71 0.9999255
## PF-SF 752792.5 -4224205 5729790.00 0.9999358
## C-SF -849530.8 -5755728 4056666.60 0.9998209
## SF-PF-SF-SG -9239763.0 -49771605 31292079.30 0.9986393
## PF-SF-SG -3354717.7 -27026329 20316894.00 0.9999610
## C-SF-SG -4957040.9 -28613868 18699786.17 0.9992617
## PF-SF-PF 5885045.3 -27400917 39171007.53 0.9997905
## C-SF-PF 4282722.1 -28992727 37558171.71 0.9999814
## C-PF -1602323.2 -6579321 3374674.30 0.9854873
Plot Results of Tukey’s Test
tukey_data <- as.data.frame(TukeyHSD(anova_model)$Position)
tukey_data$Comparison <- row.names(tukey_data)
ggplot(tukey_data, aes(x = Comparison, y = diff)) +
geom_errorbar(aes(ymin = lwr, ymax = upr), width = 0.2) +
geom_point() +
coord_flip() +
labs(x = "Difference in Mean Levels of Position", y = "", title = "95% Family-Wise Confidence Level") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 12),
axis.text.x = element_text(size = 6, hjust = 1, angle = 45),
axis.text.y = element_text(size = 6, hjust = 1)
) +
scale_y_continuous(labels = label_comma())
Generate Correlation Heatmap for Basic Statistics
basic_correlation_matrix <- cor(basic_statistics, use = "complete.obs")
melted_basic_correlation_matrix <- melt(basic_correlation_matrix)
ggplot(data = melted_basic_correlation_matrix, aes(Var1, Var2, fill = value)) +
geom_tile() +
#Specified HEX color values to match NBA logo color scheme
scale_fill_gradient2(low = "#da1a33", mid = "#ffffff", high = "#00428c",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Correlation") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 12),
axis.text.x = element_text(size = 6, hjust = 1, vjust = 1, angle = 45),
axis.text.y = element_text(size = 6),
legend.key.size = unit(0.5, "cm"),
legend.title = element_text(size = 8),
legend.text = element_text(size = 6)
) +
labs(title = "Correlation Heatmap of Basic Statistics and Salaries", x = "", y = "") +
geom_text(aes(label = sprintf("%.2f", value)), size = 2)
Generate Correlation Heatmap for Advanced Statistics
advanced_correlation_matrix <- cor(advanced_statistics, use = "complete.obs")
melted_advanced_correlation_matrix <- melt(advanced_correlation_matrix)
ggplot(data = melted_advanced_correlation_matrix, aes(Var1, Var2, fill = value)) +
geom_tile() +
#Specified HEX color values to match NBA logo color scheme
scale_fill_gradient2(low = "#da1a33", mid = "#ffffff", high = "#00428c",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Correlation") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 12),
axis.text.x = element_text(size = 6, hjust = 1, vjust = 1, angle = 45),
axis.text.y = element_text(size = 6),
legend.key.size = unit(0.5, "cm"),
legend.title = element_text(size = 8),
legend.text = element_text(size = 6)
) +
labs(title = "Correlation Heatmap of Advanced Statistics and Salaries", x = "", y = "") +
geom_text(aes(label = sprintf("%.2f", value)), size = 2)
Generate Correlation Heatmap for Full Statistics
full_correlation_matrix <- cor(full_statistics, use = "complete.obs")
melted_full_correlation_matrix <- melt(full_correlation_matrix)
ggplot(data = melted_full_correlation_matrix, aes(Var1, Var2, fill = value)) +
geom_tile() +
#Specified HEX color values to match NBA logo color scheme
scale_fill_gradient2(low = "#da1a33", mid = "#ffffff", high = "#00428c",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Correlation") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 12),
axis.text.x = element_text(size = 4, hjust = 1, vjust = 1, angle = 45),
axis.text.y = element_text(size = 4),
legend.key.size = unit(0.5, "cm"),
legend.title = element_text(size = 8),
legend.text = element_text(size = 6)
) +
labs(title = "Correlation Heatmap of Full Statistics and Salaries", x = "", y = "") +
geom_text(aes(label = sprintf("%.2f", value)), size = 1)
View Correlations Between Full Statistics and Salaries (in Descending Order)
salary_correlations <- full_correlation_matrix["Salary", ]
sorted_salary_correlations <- sort(salary_correlations, decreasing = TRUE)
sorted_salary_correlations
## Salary PTS FG FGA VORP
## 1.0000000000 0.7191173336 0.7118509280 0.6966823647 0.6768492637
## X2PA X2P FT FTA TOV
## 0.6715820234 0.6675550693 0.6645510390 0.6595895988 0.6431081241
## MP WS OBPM AST GS
## 0.6417385312 0.6162372741 0.6000296867 0.5891035926 0.5889257613
## OWS DRB BPM USG. DWS
## 0.5765526833 0.5677576398 0.5577456362 0.5565113689 0.5498690356
## Total.Minutes PER TRB X3PA X3P
## 0.5463765746 0.5373134923 0.4925254786 0.4739628546 0.4621655033
## STL AST. PF WS.48 GP
## 0.4616304850 0.4547321061 0.3861162032 0.3448925913 0.3009336031
## BLK ORB TS. FTr FT.
## 0.2908862283 0.2002398018 0.1920965177 0.1750457743 0.1565122461
## FG. eFG. DRB. DBPM X3P.
## 0.1294987211 0.1204278512 0.1147237989 0.1121376540 0.0667153966
## X2P. TRB. STL. BLK. TOV.
## 0.0439189198 0.0207517590 -0.0003385747 -0.0465065932 -0.0551807178
## X3PAr ORB.
## -0.1173386874 -0.1368441671
Fit Multiple Regression Model
linear_model <- lm(Salary ~ PTS + FG + VORP, data = nba_data)
summary(linear_model)
##
## Call:
## lm(formula = Salary ~ PTS + FG + VORP, data = nba_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23837506 -3191890 -603193 2088031 38410717
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -148496 615409 -0.241 0.8094
## PTS 701183 381109 1.840 0.0664 .
## FG 198124 1046821 0.189 0.8500
## VORP 2777388 427849 6.492 2.19e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7055000 on 463 degrees of freedom
## Multiple R-squared: 0.5688, Adjusted R-squared: 0.566
## F-statistic: 203.6 on 3 and 463 DF, p-value: < 2.2e-16
#Referenced this link: https://www.statology.org/r-vs-r-squared/ to approximate the model correlation
model_correlation <- sqrt(summary(linear_model)$r.squared)
model_correlation
## [1] 0.754166
Plot Multiple Regression Model Diagnostics (for Analysis of Outliers or High Leverage Observations)
par(mfrow=c(2,2))
plot(linear_model)
Plot Multiple Regression Model (Actual vs. Predicted Values)
nba_data$predicted_salary <- predict(linear_model, newdata = nba_data)
ggplot(nba_data, aes(x = Salary, y = predicted_salary)) +
#Specified HEX color values to match NBA logo color scheme
geom_point(alpha = 0.5, color = "#00428c") +
geom_abline(intercept = 0, slope = 1, color = "#da1a33") +
scale_x_continuous(labels = label_comma()) +
scale_y_continuous(labels = label_comma()) +
labs(x = "Actual Salaries", y = "Predicted Salaries", title = "Actual vs. Predicted Salaries") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 12),
axis.title.x = element_text(size = 10, margin = margin(t = 10, unit = "pt")),
axis.title.y = element_text(size = 10, margin = margin(r = 10, unit = "pt")),
axis.text.x = element_text(size = 6),
axis.text.y = element_text(size = 6),
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm")
)
Perform Bootstrapping on Multiple Regression Model
#Referenced this link: https://www.statmethods.net/advstats/bootstrapping.html for information on bootstrapping
bs <- function(formula, data, indices) {
d <- data[indices, ]
model <- lm(formula, data = d)
return(coef(model))
}
set.seed(123)
bootstrap_results <- boot(data = nba_data, statistic = bs, R = 1000, formula = Salary ~ PTS + FG + VORP)
bootstrap_results
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = nba_data, statistic = bs, R = 1000, formula = Salary ~
## PTS + FG + VORP)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -148496.1 19296.47 506516.6
## t2* 701183.3 20536.56 435911.6
## t3* 198124.3 -64345.66 1204520.8
## t4* 2777387.9 16439.01 503068.9
Derive Confidence Intervals from Bootstrap Results
boot.ci(bootstrap_results, type = "bca", index = 1) #Intercept
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = bootstrap_results, type = "bca", index = 1)
##
## Intervals :
## Level BCa
## 95% (-1145464, 823691 )
## Calculations and Intervals on Original Scale
boot.ci(bootstrap_results, type = "bca", index = 2) #PTS
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = bootstrap_results, type = "bca", index = 2)
##
## Intervals :
## Level BCa
## 95% (-206913, 1555303 )
## Calculations and Intervals on Original Scale
boot.ci(bootstrap_results, type = "bca", index = 3) #FG
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = bootstrap_results, type = "bca", index = 3)
##
## Intervals :
## Level BCa
## 95% (-2090459, 2746448 )
## Calculations and Intervals on Original Scale
boot.ci(bootstrap_results, type = "bca", index = 4) #VORP
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = bootstrap_results, type = "bca", index = 4)
##
## Intervals :
## Level BCa
## 95% (1798436, 3715748 )
## Calculations and Intervals on Original Scale
Plot Bootstrap Results
plot(bootstrap_results, index = 1) #Intercept
plot(bootstrap_results, index = 2) #PTS
plot(bootstrap_results, index = 3) #FG
plot(bootstrap_results, index = 4) #VORP